Personal income decreased $216.2 billion, or 1.0 percent at a monthly rate, while consumer spending increased $93.4 billion, or 0.6 percent, in September. The decrease in personal income primarily reflected the winding down of pandemic-related assistance programs.

TO DOs: * linear model: consumption ~ time, COVID-19

Data tidying

We are insterested in exploring the how the changes in the following variables in personal income might affect PCE:

  • Wages and salaries
  • Supplements to wages and salaries
  • Social Security
  • Medicare
  • Personal Current Taxes

v.s. * Personal consumption expenditures (on goods and services)

We want to look at data from 1967Q1 to 2020Q4

format_function <- function(input) {
  str_c(str_split(input, 'q')[[1]][1], ' Quarter ', str_split(input, 'q')[[1]][2])
}
pce_4720 <- read_excel("data/pce_1947_2020.xlsx", sheet = 'T20100-Q', 
                      range = 'A8:KM54') %>% 
  janitor::clean_names() %>%
  select(x2, x1967q1:x2020q4) %>%
  drop_na() %>%
  # We only want to look at several variables:
  filter(
      x2 == 'Wages and salaries' |
      x2 == 'Supplements to wages and salaries' |
      x2 == 'Social security' |
      x2 == 'Medicare' |
      x2 == 'Less: Personal current taxes' |
      x2 == 'Personal consumption expenditures'
  ) %>%
  pivot_longer(
    x1967q1:x2020q4,
    names_to = 'time',
    names_prefix = 'x',
    values_to = 'millions_of_dollars'
  ) %>%
  rename(variable = x2) %>%
  mutate(
    variable = ifelse(variable == 'Less: Personal current taxes', 'Personal current taxes', variable)
  )
  #separate(time, c("year", "quarter"), "q") %>%
  
pce_4720_formatted <- pce_4720 %>%
  mutate(
    time = map(.x = time, format_function)
  )

Overview

Let’s take a brief look at the changes of variables over time

fig_1 <- pce_4720_formatted %>%
  plot_ly(x = ~time, y = ~millions_of_dollars, type = "scatter", mode = "lines", color = ~variable) %>%
  layout(title = "<b> Spaghetti Plot for Personal Income and Dispositions v.s. PCE <b>", yaixs = list(title = "<i> Millions of Dollars <i>"), barmode = "stack", legend = list(title = list(text = '<b> Dispositions </b>'))) %>% 
  layout(legend = list(orientation = 'h', x = 0, y = -0.2))

fig_1

A log graph will be more informative

fig_2 <- pce_4720_formatted %>%
  plot_ly(x = ~time, y = ~log(millions_of_dollars), type = "scatter", mode = "lines", color = ~variable) %>%
  layout(title = "<b> Spaghetti Plot for log(Personal Income and Dispositions) v.s. PCE </b>", yaixs = list(title = "log(Millions of Dollars)"), barmode = "stack", legend = list(title = list(text = '<i> Dispositions </i>'))) %>% 
  layout(legend = list(orientation = 'h', x = 0, y = -0.2))

fig_2

Model

Now let’s look at data pre- v.s. post-COVID-19 Pandemic

Fit a multivariable model based on pre-COVID-19 Pandemic

pce_4718_by_dis <-
  pce_4720 %>%
  filter(!str_detect(time, '2019|2020')) %>%
  pivot_wider(
    names_from = variable,
    values_from = millions_of_dollars
  ) %>%
  janitor::clean_names()
  

smooth_mod = mgcv::gam(
  log(personal_consumption_expenditures) ~ s(log(wages_and_salaries)) + 
    s(log(social_security)) + s(log(supplements_to_wages_and_salaries)) + s(log(personal_current_taxes)) + s(log(medicare)), 
  data = pce_4718_by_dis)

summary(smooth_mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(personal_consumption_expenditures) ~ s(log(wages_and_salaries)) + 
##     s(log(social_security)) + s(log(supplements_to_wages_and_salaries)) + 
##     s(log(personal_current_taxes)) + s(log(medicare))
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.510e+01  3.668e-04   41171   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                             edf Ref.df       F p-value    
## s(log(wages_and_salaries))                1.000  1.000 104.907 < 2e-16 ***
## s(log(social_security))                   3.201  4.180   4.120 0.00507 ** 
## s(log(supplements_to_wages_and_salaries)) 9.000  9.000  21.982 < 2e-16 ***
## s(log(personal_current_taxes))            8.153  8.785   6.871 < 2e-16 ***
## s(log(medicare))                          8.815  8.973   8.551 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =      1   Deviance explained =  100%
## GCV = 3.2911e-05  Scale est. = 2.7979e-05  n = 208

Model Evaluation

smooth_mod %>% broom::tidy()
## # A tibble: 5 Ă— 5
##   term                                        edf ref.df statistic p.value
##   <chr>                                     <dbl>  <dbl>     <dbl>   <dbl>
## 1 s(log(wages_and_salaries))                 1.00   1.00    105.   0      
## 2 s(log(social_security))                    3.20   4.18      4.12 0.00507
## 3 s(log(supplements_to_wages_and_salaries))  9.00   9.00     22.0  0      
## 4 s(log(personal_current_taxes))             8.15   8.79      6.87 0      
## 5 s(log(medicare))                           8.82   8.97      8.55 0
smooth_mod.fit <- mgcViz::getViz(smooth_mod)
qqplot <- mgcViz::qq(smooth_mod.fit, method = "simul1", a.qqpoi = list("shape" = 1), a.ablin = list("linetype" = 2))
qqplot

Now let’s predict the PCE using our model

pce_2019 <-
  pce_4720 %>%
  filter(
    str_detect(time, '2019')
  ) %>%
  pivot_wider(
    names_from = variable,
    values_from = millions_of_dollars
  ) %>%
  janitor::clean_names()
  
pce_2019_pred <- predict.gam(smooth_mod,pce_2019)
pce_2019_pred
##        1        2        3        4 
## 16.48962 16.49738 16.50608 16.51574
log(pce_2019$personal_consumption_expenditures)
## [1] 16.47413 16.48947 16.49963 16.50738

#RMSQ
rmse(smooth_mod, pce_2019)
## [1] 0.01017228
pce_2020 <-
  pce_4720 %>%
  filter(
    str_detect(time, '2020')
  ) %>%
  pivot_wider(
    names_from = variable,
    values_from = millions_of_dollars
  ) %>%
  janitor::clean_names()
  
pce_2020_pred <- predict.gam(smooth_mod,pce_2020)
pce_2020_pred
##        1        2        3        4 
## 16.52307 16.50159 16.53446 16.55342
log(pce_2020$personal_consumption_expenditures)
## [1] 16.49279 16.38792 16.48285 16.49221

#RMSQ
rmse(smooth_mod, pce_2020)
## [1] 0.07114881

Testing on the difference

Hypothesize: there is significant difference between our model prediction for pre and post pandemic periods

Since there is not enough dataset for post pandemic, we need to bootstrap a set of new samples from year 2020 and 2021

# Bootstrapping? Testing on 1967-2016
# include dataset from 2021q1, 2021q2, 2021q3
pce_4720_raw <- read_excel("data/pce_1947_2020.xlsx", sheet = 'T20100-Q', 
                      range = 'A8:KM54') %>% 
  janitor::clean_names() %>%
  select(x2, x1967q1:x2020q4) %>%
  drop_na() %>%
  # We only want to look at several variables:
  filter(
      x2 == 'Wages and salaries' |
      x2 == 'Supplements to wages and salaries' |
      x2 == 'Social security' |
      x2 == 'Medicare' |
      x2 == 'Less: Personal current taxes' |
      x2 == 'Personal consumption expenditures'
  ) %>%
  rename(variable = x2)

rownames(pce_4720_raw) <- pull(pce_4720_raw, variable)

pce_4720_raw <- mutate(pce_4720_raw, variable = NULL)

pce_4720_wide <- as.data.frame(t(as.matrix(pce_4720_raw))) %>%
  janitor::clean_names() %>%
  rename(personal_current_taxes = less_personal_current_taxes)

Select samples data set

pce_2019_df <- subset(pce_4720_wide, str_detect(rownames(pce_4720_wide), '2019'))
pce_2020_df <- subset(pce_4720_wide, str_detect(rownames(pce_4720_wide), '2020'))

bootstrapping

boot_straps_19 = 
  pce_2019_df %>% 
  modelr::bootstrap(n = 1000)

boot_straps_20 = 
  pce_2020_df %>% 
  modelr::bootstrap(n = 1000)

Compute rmse

rmse_compute <- function(data){
  data <- as.data.frame(data)
  return(rmse(smooth_mod, data))
}
rmse_19 <- boot_straps_19 %>%
  mutate(
    rmse = map_dbl(strap, rmse_compute)
  ) %>%
  select(rmse)
  
rmse_20 <- boot_straps_20 %>%
  mutate(
    rmse = map_dbl(strap, rmse_compute)
  ) %>%
  select(rmse)          

Perform test

t.test(rmse_19,rmse_20)
## 
##  Welch Two Sample t-test
## 
## data:  rmse_19 and rmse_20
## t = -108.23, df = 1026.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05852254 -0.05643821
## sample estimates:
##   mean of x   mean of y 
## 0.009950766 0.067431140

We reject the null hypothesis and conclude that there is significant difference between our prediction of personal consumption expenditures for year 2019 and year 2020.